Library Loading for the analysis

suppressMessages(
suppressWarnings(
  c(library(data.table),
    library(tidyverse),
    library(rtracklayer),
    library(DESeq2),
    library(Biostrings),
    library(Rsubread),
    library(pheatmap),
    library(gridExtra),
    library(ggrepel),
    library(CLIPanalyze),
    library(ggplot2),
    library(RColorBrewer),
    library(mixOmics),
    library(plotly))
  )
)
##   [1] "data.table"           "stats"                "graphics"            
##   [4] "grDevices"            "utils"                "datasets"            
##   [7] "methods"              "base"                 "forcats"             
##  [10] "stringr"              "dplyr"                "purrr"               
##  [13] "readr"                "tidyr"                "tibble"              
##  [16] "ggplot2"              "tidyverse"            "data.table"          
##  [19] "stats"                "graphics"             "grDevices"           
##  [22] "utils"                "datasets"             "methods"             
##  [25] "base"                 "rtracklayer"          "GenomicRanges"       
##  [28] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
##  [31] "BiocGenerics"         "parallel"             "stats4"              
##  [34] "forcats"              "stringr"              "dplyr"               
##  [37] "purrr"                "readr"                "tidyr"               
##  [40] "tibble"               "ggplot2"              "tidyverse"           
##  [43] "data.table"           "stats"                "graphics"            
##  [46] "grDevices"            "utils"                "datasets"            
##  [49] "methods"              "base"                 "DESeq2"              
##  [52] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
##  [55] "matrixStats"          "Biobase"              "rtracklayer"         
##  [58] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [61] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [64] "stats4"               "forcats"              "stringr"             
##  [67] "dplyr"                "purrr"                "readr"               
##  [70] "tidyr"                "tibble"               "ggplot2"             
##  [73] "tidyverse"            "data.table"           "stats"               
##  [76] "graphics"             "grDevices"            "utils"               
##  [79] "datasets"             "methods"              "base"                
##  [82] "Biostrings"           "XVector"              "DESeq2"              
##  [85] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
##  [88] "matrixStats"          "Biobase"              "rtracklayer"         
##  [91] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
##  [94] "S4Vectors"            "BiocGenerics"         "parallel"            
##  [97] "stats4"               "forcats"              "stringr"             
## [100] "dplyr"                "purrr"                "readr"               
## [103] "tidyr"                "tibble"               "ggplot2"             
## [106] "tidyverse"            "data.table"           "stats"               
## [109] "graphics"             "grDevices"            "utils"               
## [112] "datasets"             "methods"              "base"                
## [115] "Rsubread"             "Biostrings"           "XVector"             
## [118] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [121] "BiocParallel"         "matrixStats"          "Biobase"             
## [124] "rtracklayer"          "GenomicRanges"        "GenomeInfoDb"        
## [127] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [130] "parallel"             "stats4"               "forcats"             
## [133] "stringr"              "dplyr"                "purrr"               
## [136] "readr"                "tidyr"                "tibble"              
## [139] "ggplot2"              "tidyverse"            "data.table"          
## [142] "stats"                "graphics"             "grDevices"           
## [145] "utils"                "datasets"             "methods"             
## [148] "base"                 "pheatmap"             "Rsubread"            
## [151] "Biostrings"           "XVector"              "DESeq2"              
## [154] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [157] "matrixStats"          "Biobase"              "rtracklayer"         
## [160] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [163] "S4Vectors"            "BiocGenerics"         "parallel"            
## [166] "stats4"               "forcats"              "stringr"             
## [169] "dplyr"                "purrr"                "readr"               
## [172] "tidyr"                "tibble"               "ggplot2"             
## [175] "tidyverse"            "data.table"           "stats"               
## [178] "graphics"             "grDevices"            "utils"               
## [181] "datasets"             "methods"              "base"                
## [184] "gridExtra"            "pheatmap"             "Rsubread"            
## [187] "Biostrings"           "XVector"              "DESeq2"              
## [190] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [193] "matrixStats"          "Biobase"              "rtracklayer"         
## [196] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [199] "S4Vectors"            "BiocGenerics"         "parallel"            
## [202] "stats4"               "forcats"              "stringr"             
## [205] "dplyr"                "purrr"                "readr"               
## [208] "tidyr"                "tibble"               "ggplot2"             
## [211] "tidyverse"            "data.table"           "stats"               
## [214] "graphics"             "grDevices"            "utils"               
## [217] "datasets"             "methods"              "base"                
## [220] "ggrepel"              "gridExtra"            "pheatmap"            
## [223] "Rsubread"             "Biostrings"           "XVector"             
## [226] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [229] "BiocParallel"         "matrixStats"          "Biobase"             
## [232] "rtracklayer"          "GenomicRanges"        "GenomeInfoDb"        
## [235] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [238] "parallel"             "stats4"               "forcats"             
## [241] "stringr"              "dplyr"                "purrr"               
## [244] "readr"                "tidyr"                "tibble"              
## [247] "ggplot2"              "tidyverse"            "data.table"          
## [250] "stats"                "graphics"             "grDevices"           
## [253] "utils"                "datasets"             "methods"             
## [256] "base"                 "CLIPanalyze"          "GenomicAlignments"   
## [259] "Rsamtools"            "GenomicFeatures"      "AnnotationDbi"       
## [262] "plyr"                 "ggrepel"              "gridExtra"           
## [265] "pheatmap"             "Rsubread"             "Biostrings"          
## [268] "XVector"              "DESeq2"               "SummarizedExperiment"
## [271] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [274] "Biobase"              "rtracklayer"          "GenomicRanges"       
## [277] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [280] "BiocGenerics"         "parallel"             "stats4"              
## [283] "forcats"              "stringr"              "dplyr"               
## [286] "purrr"                "readr"                "tidyr"               
## [289] "tibble"               "ggplot2"              "tidyverse"           
## [292] "data.table"           "stats"                "graphics"            
## [295] "grDevices"            "utils"                "datasets"            
## [298] "methods"              "base"                 "CLIPanalyze"         
## [301] "GenomicAlignments"    "Rsamtools"            "GenomicFeatures"     
## [304] "AnnotationDbi"        "plyr"                 "ggrepel"             
## [307] "gridExtra"            "pheatmap"             "Rsubread"            
## [310] "Biostrings"           "XVector"              "DESeq2"              
## [313] "SummarizedExperiment" "DelayedArray"         "BiocParallel"        
## [316] "matrixStats"          "Biobase"              "rtracklayer"         
## [319] "GenomicRanges"        "GenomeInfoDb"         "IRanges"             
## [322] "S4Vectors"            "BiocGenerics"         "parallel"            
## [325] "stats4"               "forcats"              "stringr"             
## [328] "dplyr"                "purrr"                "readr"               
## [331] "tidyr"                "tibble"               "ggplot2"             
## [334] "tidyverse"            "data.table"           "stats"               
## [337] "graphics"             "grDevices"            "utils"               
## [340] "datasets"             "methods"              "base"                
## [343] "RColorBrewer"         "CLIPanalyze"          "GenomicAlignments"   
## [346] "Rsamtools"            "GenomicFeatures"      "AnnotationDbi"       
## [349] "plyr"                 "ggrepel"              "gridExtra"           
## [352] "pheatmap"             "Rsubread"             "Biostrings"          
## [355] "XVector"              "DESeq2"               "SummarizedExperiment"
## [358] "DelayedArray"         "BiocParallel"         "matrixStats"         
## [361] "Biobase"              "rtracklayer"          "GenomicRanges"       
## [364] "GenomeInfoDb"         "IRanges"              "S4Vectors"           
## [367] "BiocGenerics"         "parallel"             "stats4"              
## [370] "forcats"              "stringr"              "dplyr"               
## [373] "purrr"                "readr"                "tidyr"               
## [376] "tibble"               "ggplot2"              "tidyverse"           
## [379] "data.table"           "stats"                "graphics"            
## [382] "grDevices"            "utils"                "datasets"            
## [385] "methods"              "base"                 "mixOmics"            
## [388] "lattice"              "MASS"                 "RColorBrewer"        
## [391] "CLIPanalyze"          "GenomicAlignments"    "Rsamtools"           
## [394] "GenomicFeatures"      "AnnotationDbi"        "plyr"                
## [397] "ggrepel"              "gridExtra"            "pheatmap"            
## [400] "Rsubread"             "Biostrings"           "XVector"             
## [403] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [406] "BiocParallel"         "matrixStats"          "Biobase"             
## [409] "rtracklayer"          "GenomicRanges"        "GenomeInfoDb"        
## [412] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [415] "parallel"             "stats4"               "forcats"             
## [418] "stringr"              "dplyr"                "purrr"               
## [421] "readr"                "tidyr"                "tibble"              
## [424] "ggplot2"              "tidyverse"            "data.table"          
## [427] "stats"                "graphics"             "grDevices"           
## [430] "utils"                "datasets"             "methods"             
## [433] "base"                 "plotly"               "mixOmics"            
## [436] "lattice"              "MASS"                 "RColorBrewer"        
## [439] "CLIPanalyze"          "GenomicAlignments"    "Rsamtools"           
## [442] "GenomicFeatures"      "AnnotationDbi"        "plyr"                
## [445] "ggrepel"              "gridExtra"            "pheatmap"            
## [448] "Rsubread"             "Biostrings"           "XVector"             
## [451] "DESeq2"               "SummarizedExperiment" "DelayedArray"        
## [454] "BiocParallel"         "matrixStats"          "Biobase"             
## [457] "rtracklayer"          "GenomicRanges"        "GenomeInfoDb"        
## [460] "IRanges"              "S4Vectors"            "BiocGenerics"        
## [463] "parallel"             "stats4"               "forcats"             
## [466] "stringr"              "dplyr"                "purrr"               
## [469] "readr"                "tidyr"                "tibble"              
## [472] "ggplot2"              "tidyverse"            "data.table"          
## [475] "stats"                "graphics"             "grDevices"           
## [478] "utils"                "datasets"             "methods"             
## [481] "base"

Filtering of all peaks

Load the rds file from all samples CLIP analysis and use MA plot to visuaize that sequence depth are balanced between CLIP and IC libraries, and the peaks that were called.

peak.data<- readRDS("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/joint_analysis/peakdata.2020-03-08.rds")

plotMA(peak.data$gene.counts.nopeaks,
       main = "MA plot for all HEAP vs IC\n in genes outside peaks")

plotMA(peak.data$res.deseq,
       main = "MA plot for all HEAP vs IC in peaks,\none-sided test")

Prepare the peak, assign scores to KRas peaks by adjusted p-value.

peaks.all <- peak.data$peaks
peaks.all <- subset(peaks.all, width > 20)
peaks.all <- subset(peaks.all, log2FC > 0)
peaks.all <- keepStandardChromosomes(peaks.all)
score(peaks.all) <- -log10(peaks.all$padj)
length(peaks.all)
## [1] 6150
summary(width(peaks.all))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    21.0    51.0    55.0    58.9    62.0   167.0
count.threshold <- 10
norm.counts <- counts(peak.data$peak.counts, normalized = TRUE)
norm.counts <- norm.counts[names(peaks.all), ]
colnames(norm.counts)[1:6] <- c(paste0("HF", 4:6), paste0("HFK", 4:6))
colnames(norm.counts)[7:12] <- c(paste0("HF-IC", 4:6), paste0("HFK-IC", 4:6))
selected.peaks <- (rowMeans(norm.counts[, paste0("HF", 4:6)]) > count.threshold) |
    (rowMeans(norm.counts[, paste0("HFK", 4:6)]) > count.threshold)
peaks.all <- peaks.all[selected.peaks, ]

length(peaks.all)
## [1] 3861
summary(width(peaks.all))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   51.00   55.00   57.66   61.00  163.00
peaks.all.overlaps <- peak.data[[2]]
peaks.all.overlaps <-
    peaks.all.overlaps[name %in% names(peaks.all)]
peaks.all.lncRNA <-
    peaks.all.overlaps[gene_type %in% c("lincRNA", "antisense",
                                              "processed_transcript")]
peaks.all.lncRNA.arbitrary <- peaks.all.lncRNA[, .(name, gene_name)]
peaks.all.lncRNA.arbitrary <-
    unique(peaks.all.lncRNA.arbitrary, by = "name")
peaks.all$lncRNA <- as.character(NA)
peaks.all[peaks.all.lncRNA.arbitrary$name, ]$lncRNA <-
    peaks.all.lncRNA.arbitrary$gene_name
peaks.all$annot <- ifelse(is.na(peaks.all$lncRNA),
                                peaks.all$annot,
                                "lncRNA")

padj.threshold <- 0.05
filter.peaks <- subset(peaks.all, padj < padj.threshold)
score(filter.peaks) <- -log10(filter.peaks$padj)
length(filter.peaks)
## [1] 3300
summary(width(filter.peaks))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   51.00   54.00   57.29   60.00  163.00
# HF peak filtering
peak.data.hf <-
    peak.data$peak.counts
colnames(peak.data.hf) <- c(paste0("HF", 4:6), paste0("HFK", 4:6), paste0("HF-IC", 4:6), paste0("HFK-IC", 4:6))
peak.data.hf <-
    peak.data.hf[names(peaks.all),
                          c("HF4", "HF5", "HF6", "HF-IC4", "HF-IC5", "HF-IC6")]
design(peak.data.hf) <- ~ condition

peak.data.hf <- DESeq(peak.data.hf)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotMA(peak.data.hf,
       main = "MA plot for HF over input\nin all peaks",
       xlab = "Mean of normalized counts")

res.hf <- results(peak.data.hf)

peaks.all$hf.padj <- as.numeric(NA)
peaks.all$hf.log2FC <- as.numeric(NA)
peaks.all[rownames(res.hf), ]$hf.padj <-
    res.hf$padj
peaks.all[rownames(res.hf), ]$hf.log2FC <-
    res.hf$log2FoldChange

peaks.hf <- subset(peaks.all,
                    padj < padj.threshold & log2FC > 0 &
                    hf.padj < padj.threshold &
                    hf.log2FC > 0)
score(peaks.hf) <- -log10(peaks.hf$hf.padj)
length(peaks.hf)
## [1] 2239
length(subsetByOverlaps(peaks.hf, filter.peaks))
## [1] 2239
summary(width(peaks.hf))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   51.00   54.00   56.64   59.00  163.00
peak.data.hfk <-
    peak.data$peak.counts
colnames(peak.data.hfk) <- c(paste0("HF", 4:6), paste0("HFK", 4:6), paste0("HF-IC", 4:6), paste0("HFK-IC", 4:6))
peak.data.hfk <-
    peak.data.hfk[names(peaks.all),
                          c("HFK4", "HFK5", "HFK6", "HFK-IC4", "HFK-IC5", "HFK-IC6")]
design(peak.data.hfk) <- ~ condition

peak.data.hfk <- DESeq(peak.data.hfk)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotMA(peak.data.hfk,
       main = "MA plot for HFK over input\nin all peaks",
       xlab = "Mean of normalized counts")

res.hfk <- results(peak.data.hfk)

peaks.all$hfk.padj <- as.numeric(NA)
peaks.all$hfk.log2FC <- as.numeric(NA)
peaks.all[rownames(res.hfk), ]$hfk.padj <-
    res.hfk$padj
peaks.all[rownames(res.hfk), ]$hfk.log2FC <-
    res.hfk$log2FoldChange

peaks.hfk <- subset(peaks.all,
                    padj < padj.threshold & log2FC > 0 &
                    hfk.padj < padj.threshold &
                    hfk.log2FC > 0)
score(peaks.hfk) <- -log10(peaks.hfk$hfk.padj)
length(peaks.hfk)
## [1] 2871
length(subsetByOverlaps(peaks.hfk, filter.peaks))
## [1] 2871
summary(width(peaks.hfk))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   51.00   54.00   57.02   60.00  163.00
colnames(peak.data$peak.counts)[1:6] <- c(paste0("HF", 4:6), paste0("HFK", 4:6))
dds.hfk.hf <-
    peak.data$peak.counts[names(filter.peaks),
                                    c(paste0("HF", 4:6), paste0("HFK", 4:6))]
dds.hfk.hf$sampletype <-
    factor(c(rep("HF", 3), rep("HFK", 3)), levels = c("HF", "HFK"))

design(dds.hfk.hf) <- ~ sampletype 
dds.hfk.hf <- DESeq(dds.hfk.hf)
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res.hfk.hf <- results(dds.hfk.hf, contrast = c("sampletype", "HFK", "HF"))
plotMA(res.hfk.hf,
       main = "MA plot for HFK over HF\nin all peaks (significant over input)",
       xlab = "Mean of normalized counts")

summary(res.hfk.hf)
## 
## out of 3300 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1009, 31%
## LFC < 0 (down)     : 902, 27%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
dds_transform <- varianceStabilizingTransformation(dds.hfk.hf)
rawCountTable_transform <- as.data.frame(assay(dds_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_12232019/Analysis/CLIP_Analysis/Data Visualization")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen 
##                 2

Final output is following: Hierchical Clustering

plotPCA(dds_transform, intgroup = "sampletype", ntop = 500) +   
  geom_text(aes(label=name), vjust = 2, hjust = -0.1) + xlim(-50,60) + ylim(-40,50)

peaks.all$hfk.hf.padj <- as.numeric(NA)
peaks.all$hfk.hf.log2FC <- as.numeric(NA)
peaks.all[rownames(res.hfk.hf), ]$hfk.hf.padj <-
    res.hfk.hf$padj
peaks.all[rownames(res.hfk.hf), ]$hfk.hf.log2FC <-
    res.hfk.hf$log2FoldChange

Save the datasets

saveRDS(peaks.all, "Datafiles/peaks-all-12232019.rds")
filter.peaks <- subset(peaks.all, padj < padj.threshold)
saveRDS(filter.peaks, "Datafiles/peaks-filtered-12232019.rds")

Map seed to peaks

mirna.info.family <- readRDS("mirna-info-family-seedmatches.rds")  
assignMirToPeaks <- function(miRNA = mirs, 
                             peaks = es.peaks.utr3,
                             database = mirna.info.family){
  require(BSgenome)
  require(CLIPanalyze)
  require(Biostrings)
  bsgenome <- load.bsgenome("mm10")
  peaks.seq <- get.seqs(bsgenome, peaks)
  peaks$seed.8mer <- as.character(NA)
  peaks$seed.7m8 <- as.character(NA)
  peaks$seed.7A1 <- as.character(NA)
  peaks$seed.6mer <- as.character(NA)
  
  for (i in 1:length(miRNA)){
    mir <- miRNA[i]
    #Prepare seed matches
    mir.6m <- as.character(database[database$miR.family %in% mir, ]$seedmatch.6)
    mir.7m8 <- as.character(database[database$miR.family %in% mir, ]$seedmatch.m8)
    mir.7mA <- as.character(database[database$miR.family %in% mir, ]$seedmatch.A1)
    mir.8m <- as.character(database[database$miR.family %in% mir, ]$seedmatch.8)
    #Filter peaks with 6mer matches
    match <- GRanges(vmatchPattern(mir.6m, peaks.seq))
    #Go back to peaks and map the seed match and extend 1 nt on both directions
    match.extend <- peaks[seqnames(match),]
    match.strand <- as.logical(strand(match.extend) == "+")
    
    start(match.extend[match.strand,]) <- start(match.extend[match.strand,]) + start(match[match.strand,]) - 2
    match.extend[match.strand,] <- resize(match.extend[match.strand,], fix = "start", width = 8)
    
    end(match.extend[!match.strand,]) <- end(match.extend[!match.strand,]) - start(match[!match.strand,]) + 2
    match.extend[!match.strand,] <- resize(match.extend[!match.strand,], fix = "start", width = 8)
    #Assign seed types to these matches
    match.extend.seq <- get.seqs(bsgenome, match.extend)
    match.extend.seq.df <- data.frame(Peaks = names(match.extend.seq),
                                      Sequence = as.character(match.extend.seq))
    
    for (j in 1:nrow(match.extend.seq.df)){
      peak.name <- as.character(match.extend.seq.df[j, "Peaks"])
      seq <- as.character(match.extend.seq.df[j, "Sequence"])
      if (grepl(mir.8m, seq)){
        peaks[peak.name, ]$seed.8mer <- ifelse(is.na(peaks[peak.name, ]$seed.8mer),
                                               mir, 
                                               paste(peaks[peak.name, ]$seed.8mer, mir, sep = ", "))
      } else if (grepl(mir.7m8, seq)){
        peaks[peak.name, ]$seed.7m8 <- ifelse(is.na(peaks[peak.name, ]$seed.7m8),
                                              mir, 
                                              paste(peaks[peak.name, ]$seed.7m8, mir, sep = ", "))
      } else if (grepl(mir.7mA, seq)){
        peaks[peak.name, ]$seed.7A1 <- ifelse(is.na(peaks[peak.name, ]$seed.7A1),
                                              mir, 
                                              paste(peaks[peak.name, ]$seed.7A1, mir, sep = ", "))
      } else {
        peaks[peak.name, ]$seed.6mer <- ifelse(is.na(peaks[peak.name, ]$seed.6mer),
                                                mir, 
                                                paste(peaks[peak.name, ]$seed.6mer, mir, sep = ", "))
      }
    }
  }
  
  return(peaks)
}

miRNAs with mean counts larger than 200 are selected and mapped to peaks

mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
mirs <- subset(mirna.family.DGE, baseMean > 200)
mirs <- mirs$miR.family
#peaks.mirs <- assignMirToPeaks(miRNA = mirs,
#                               peaks = filter.peaks,
#                               database = mirna.info.family)
#saveRDS(peaks.mirs, "Datafiles/peaks-mirs-200-12232019.rds")
peaks.mirs <- readRDS("Datafiles/peaks-mirs-200-12232019.rds")
targetofmiR <- function(peaks.mir = brain.peaks.mirs,
                        miRNA = "",
                        sitetype = "8mer"){
  peaks.mir.sub <- as.data.frame(peaks.mir[,c("log2FC", "padj", 
                                              "seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")])
  peaks.seedmatch <- lapply(c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer"),
                            function(seed){
                              map <- peaks.mir.sub[grepl(miRNA, peaks.mir.sub[,seed]),]
                              map <- rownames(map)
                              map
                            })
  names(peaks.seedmatch) <- c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")

  if (sitetype == "8mer"){
    maps <- peaks.seedmatch[[1]]
  } else if (sitetype == "7mer_above"){
    maps <- unique(unlist(peaks.seedmatch[1:3]))
  } else if (sitetype == "7mer"){
    maps <- unique(unlist(peaks.seedmatch[2:3]))
  } else if (sitetype == "6mer"){
    maps <- peaks.seedmatch[[4]]
  } else {
    print("Please input site type as: 8mer, 7mer_above, 7mer or 6mer")
  }
    return(peaks.mir[maps])
                        }
mirs.peaks <- lapply(mirs,
                     function(mir){
                       targetofmiR(miRNA = mir,
                                   peaks.mir = peaks.mirs,
                                   sitetype = "7mer_above")
                     })
names(mirs.peaks) <- mirs
saveRDS(mirs.peaks, "Datafiles/miRNA-peaks-list-12232019.rds")
lens <- as.data.frame(sapply(mirs.peaks, function(x) length(x)))
mirs.peaks.log2FC.median <- sapply(mirs.peaks,
                            function(list){
                              log2FCs <- list$hfk.hf.log2FC
                              return(median(log2FCs))
                            })
mirs.peaks.log2FC.mean <- sapply(mirs.peaks,
                            function(list){
                              log2FCs <- list$hfk.hf.log2FC
                              return(mean(log2FCs))
                            })

mirs.targets.log2FC <- cbind(as.data.frame(mirs.peaks.log2FC.median),
                             as.data.frame(mirs.peaks.log2FC.mean),
                             lens)
colnames(mirs.targets.log2FC) <- c("targets_log2FC_median","targets_log2FC_mean", "N")

mirs.targets.log2FC <- merge(mirs.targets.log2FC, mirna.family.DGE[,c("log2FoldChange", "padj")], by = 0)
colnames(mirs.targets.log2FC)[1] <- "miR.family"

Show any miRNA that has more than 20 peaks matched

df <- subset(mirs.targets.log2FC, N > 20)
p <- ggplot(df, aes(x = log2FoldChange, y = targets_log2FC_median, label = miR.family, size = N)) +
  geom_point(colour = "#1C75BB", alpha = 0.6) +
  #scale_color_manual(fill = "yellow") +
  geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
  xlab("miRNA expression Fold change log2 (HFK / HF)") +
  ylab("Median of peak signal changes log2 (HFK / HF)") +
  theme_bw() +
      theme(panel.border = element_blank(),
      panel.background = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(size=12, margin = margin(t = 10)),
      axis.title.y = element_text(size=12, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_line(size = 0.5),
      axis.line.x = element_line(size = 0.5),
      axis.ticks.x = element_line(size = 0),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p <- ggplotly(p)
p
p2 <- ggplot(df, aes(x = log2FoldChange, y = targets_log2FC_mean, label = miR.family, size = N)) +
  geom_point(colour = "#EC469A", alpha = 0.6) +
  #scale_color_manual(fill = "yellow") +
  geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
  xlab("miRNA expression Fold change log2 (HFK / HF)") +
  ylab("Mean of peak signal changes log2 (HFK / HF)") +
  theme_bw() +
      theme(panel.border = element_blank(),
      panel.background = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(size=12, margin = margin(t = 10)),
      axis.title.y = element_text(size=12, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_line(size = 0.5),
      axis.line.x = element_line(size = 0.5),
      axis.ticks.x = element_line(size = 0),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p2 <- ggplotly(p2)
p2

If we only look at the top 40 highly expressed miRNAs

mirna.family.DGE <- mirna.family.DGE[order(-mirna.family.DGE$baseMean),]
mirs.40 <- rownames(mirna.family.DGE)[1:40]

df.40 <- df[df$miR.family %in% mirs.40,]
p3 <- ggplot(df.40, aes(x = log2FoldChange, y = targets_log2FC_median, label = miR.family, size = N)) +
  geom_point(colour = "#1C75BB", alpha = 0.6) +
  #scale_color_manual(fill = "yellow") +
  geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
  xlab("Top 40 miRNA expression Fold change log2 (HFK / HF)") +
  ylab("Median of peak signal changes log2 (HFK / HF)") +
  theme_bw() +
      theme(panel.border = element_blank(),
      panel.background = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(size=12, margin = margin(t = 10)),
      axis.title.y = element_text(size=12, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_line(size = 0.5),
      axis.line.x = element_line(size = 0.5),
      axis.ticks.x = element_line(size = 0),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p3 <- ggplotly(p3)
p3
p4 <- ggplot(df.40, aes(x = log2FoldChange, y = targets_log2FC_mean, label = miR.family, size = N)) +
  geom_point(colour = "#EC469A", alpha = 0.6) +
  #scale_color_manual(fill = "yellow") +
  geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
  geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
  xlab("Top 40 miRNA expression Fold change log2 (HFK / HF)") +
  ylab("Mean of peak signal changes log2 (HFK / HF)") +
  theme_bw() +
      theme(panel.border = element_blank(),
      panel.background = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(size=12, margin = margin(t = 10)),
      axis.title.y = element_text(size=12, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_line(size = 0.5),
      axis.line.x = element_line(size = 0.5),
      axis.ticks.x = element_line(size = 0),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
p4 <- ggplotly(p4)
p4

Here are the top 10 miRNAs with the most peaks associated.

color.vec <- brewer.pal(name = "Spectral", n = 11)
df <- as.data.frame(filter.peaks)
df$hfk.hf.logP <- -log10(df$hfk.hf.padj)
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)
mirs.peaks.names <- lapply(mirs.peaks, function(x) names(x))

for (i in 1:10){
  p <- ggplot() + 
    geom_point(data = df, aes(x = hfk.hf.log2FC, y = hfk.hf.logP), size = 1, alpha = 0.5, color = "grey80") +
    geom_point(data = df[mirs.peaks.names[[mirna[i]]],], aes(x = hfk.hf.log2FC, y = hfk.hf.logP), size = 1, alpha = 0.7, color = "#7570B3") +
    geom_hline(yintercept = -log10(0.05), linetype = "dashed", size = 0.3) +
    geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", size = 0.3) +
    xlab("Fold change log2 (HFK / HF)") +
    ylab("-log10(FDR)") +
    theme_bw() +
    theme(panel.border = element_blank(),
      panel.background = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.title.x = element_text(size=12, margin = margin(t = 10)),
      axis.title.y = element_text(size=12, margin = margin(r = 10)),
      axis.text = element_text(size=10),
      axis.line.y = element_line(size = 0.5),
      axis.line.x = element_line(size = 0.5),
      axis.ticks.x = element_line(size = 0),
      axis.ticks.y = element_line(size = 0.5),
      plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) +
    #guides(fill = guide_legend(title = "miRNA family")) +
    ggtitle(sprintf("%s targets", mirna[i ]))
  print(p)
  
}

I just want a table of all top 4o miRNAs and the number of peaks that are associated with them with LFC (HFK/HF) > 0 and < 0.

peak.number <- as.data.frame(table(mirs.peaks[[1]]$hfk.hf.log2FC > 0))
peak.number <- t(peak.number[,-1])
colnames(peak.number) <- c("LFC(HFK/HF) < 0", "LFC(HFK/HF) > 0")
for (i in 2: 40) {
  new.peak <- as.data.frame(table(mirs.peaks[[i]]$hfk.hf.log2FC > 0))
  new.peak <- t(new.peak[,-1])
  peak.number <- rbind(peak.number, new.peak)
}

rownames(peak.number) <- names(mirs.peaks)[1:40]
peak.number <- rbind(peak.number, c(sum(df$hfk.hf.log2FC > 0), sum(df$hfk.hf.log2FC < 0)))
rownames(peak.number)[dim(peak.number)[1]] <- "Total"
peak.number
##                                      LFC(HFK/HF) < 0 LFC(HFK/HF) > 0
## let-7-5p/miR-98-5p                               172             184
## miR-194-5p                                        92             135
## miR-200-3p/429-3p                                 88             113
## miR-15-5p/16-5p/195-5p/322-5p/497-5p             104              90
## miR-29-3p                                         75              82
## miR-24-3p                                         83              62
## miR-22-3p                                         70              59
## miR-103-3p/107-3p                                 71              53
## miR-27-3p                                         60              59
## miR-26-5p                                         52              61
## miR-17-5p/20-5p/93-5p/106-5p                      49              59
## miR-141-3p                                        56              49
## miR-196-5p                                        45              59
## miR-23-3p                                         49              40
## miR-30-5p/384-5p                                  57              26
## miR-484                                           42              38
## miR-19-3p                                         41              33
## miR-96-5p                                         28              39
## miR-7-5p                                          23              41
## miR-182-5p                                        26              35
## miR-3058-5p                                       28              32
## miR-210-3p                                        29              29
## miR-147-3p                                        35              23
## miR-148-3p/152-3p                                 37              21
## miR-205-5p                                        32              22
## miR-25-3p/32-5p/92-3p/363-3p/367-3p               30              19
## miR-130-3p/301-3p                                 24              21
## miR-30a-3p/30d-3p/30e-3p                          18              26
## miR-34-5p/449-5p                                  24              20
## miR-185-5p                                        18              25
## miR-194-2-3p/6926-5p/7055-5p                      14              29
## miR-378-3p                                        27              16
## miR-192-5p/215-5p                                 28              14
## miR-423-5p                                        18              22
## miR-671-5p                                        16              23
## miR-28-5p/708-5p                                  15              20
## miR-141-5p/6769b-3p                               22              12
## miR-181-5p                                        16              18
## miR-139-5p                                         9              22
## miR-21ac-5p                                       19              12
## Total                                           1732            1568